설치과정은 교수님 홈페이지를 참고하였습니다. https://statkclee.github.io/data-science/ds-postgreSQL.html
C:\Program Files\PostgreSQL\12CREATE DATABASE dvd; C:\Program Files\PostgreSQL\12\binpg_restore -U postgres -d dvd C:\Users\MINJI\MJ\2019-2\Data_Engineering\HW\dvdrental.tar데이터 추가
pgAdmin 실행
ER 다이어그램
#install.packages('RPostgreSQL')
#install.packages('DBI')
library(RPostgreSQL)
## Loading required package: DBI
library(DBI)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(datamodelr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
pgdrv <- dbDriver("PostgreSQL")
con <- dbConnect(pgdrv, dbname="dvd",
port="5432",
user="postgres",
password=1111,
host="localhost")
qry <- "SELECT *
FROM pg_catalog.pg_tables"
posible_table <- dbGetQuery(con, qry) %>% filter(schemaname == 'public')
posible_table$tablename
## [1] "actor" "store" "address" "category"
## [5] "city" "country" "customer" "film_actor"
## [9] "film_category" "inventory" "language" "rental"
## [13] "staff" "payment" "film"
col_qry <- "SELECT table_name,
STRING_AGG(column_name, ', ') AS columns
FROM information_schema.columns
WHERE table_schema = 'public'
GROUP BY table_name;"
dbGetQuery(con, col_qry) %>%
DT::datatable()
category <- dbGetQuery(con, "SELECT * FROM category")
film_category <- dbGetQuery(con, "SELECT * FROM film_category")
film <- dbGetQuery(con, "SELECT * FROM film")
## Warning in postgresqlExecStatement(conn, statement, ...): RS-DBI driver
## warning: (unrecognized PostgreSQL field type tsvector (id:3614) in column
## 12)
language <- dbGetQuery(con, "SELECT * FROM language")
inventory <- dbGetQuery(con, "SELECT * FROM inventory")
rental <- dbGetQuery(con, "SELECT * FROM rental")
payment <- dbGetQuery(con, "SELECT * FROM payment")
staff <- dbGetQuery(con, "SELECT * FROM staff")
actor <- dbGetQuery(con, "SELECT * FROM actor")
customer <- dbGetQuery(con, "SELECT * FROM customer")
address <- dbGetQuery(con, "SELECT * FROM address")
city <- dbGetQuery(con, "SELECT * FROM city")
country <- dbGetQuery(con, "SELECT * FROM country")
store <- dbGetQuery(con, "SELECT * FROM store")
film_actor <- dbGetQuery(con, "SELECT * FROM film_actor")
https://www.freecodecamp.org/news/project-1-analyzing-dvd-rentals-with-sql-fd12dd674a64/
모든 테이블을 원자료 그대로 불러와서 dplyr 패키지를 통해서 전처리하였다.
[ 기본 질문 ]
[ 추가 질문 ]
( 가장 많이/적게 대여된 장르는 무엇이며 그들의 총 매출액은 얼마인가 )
data_model <- dm_from_data_frames(category, film_category, film, inventory, rental, customer)
data_model <- dm_add_references(
data_model,
category$category_id == film_category$category_id,
film_category$film_id == film$film_id,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id,
rental$customer_id == customer$customer_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
data_model <- dm_from_data_frames(category, film_category, film, inventory, rental, payment)
data_model <- dm_add_references(
data_model,
category$category_id == film_category$category_id,
film_category$film_id == film$film_id,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id,
rental$rental_id == payment$rental_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <- "WITH t1 AS (SELECT c.name AS Genre, COUNT(cu.customer_id) AS Total_rent_demand
FROM category c
JOIN film_category fc
USING(category_id)
JOIN film f
USING(film_id)
JOIN inventory i
USING(film_id)
JOIN rental r
USING(inventory_id)
JOIN customer cu
USING(customer_id)
GROUP BY 1
ORDER BY 2 DESC),
t2 AS (SELECT c.name AS Genre, SUM(p.amount) AS Total_sales
FROM category c
JOIN film_category fc
USING(category_id)
JOIN film f
USING(film_id)
JOIN inventory i
USING(film_id)
JOIN rental r
USING(inventory_id)
JOIN payment p
USING(rental_id)
GROUP BY 1
ORDER BY 2 DESC)
SELECT t1.genre, t1.total_rent_demand, t2.total_sales
FROM t1
JOIN t2
ON t1.genre = t2.genre;"
Q1 <- dbGetQuery(con, qry)
DT::datatable(Q1)
Q1_plot <- Q1 %>%
gather(variable, value, -genre) %>%
ggplot(data = ., aes(x=genre, y=value, group = variable, colour=variable, text=paste0("genre: ", genre, "\n", "value: ", value))) +
geom_line() +
geom_point() +
facet_wrap(~variable, scale="free") +
xlab('Genre') +
theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position="none")
ggplotly(Q1_plot, tooltip = "text")
각 장르의 distinct한 고객수
data_model <- dm_from_data_frames(category, film_category, film, inventory, rental, customer)
data_model <- dm_add_references(
data_model,
category$category_id == film_category$category_id,
film_category$film_id == film$film_id,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id,
rental$customer_id == customer$customer_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qury <- "SELECT c.name AS Genre, COUNT(DISTINCT cu.customer_id) AS total_distinct_users
FROM category c
JOIN film_category fc
USING(category_id)
JOIN film f
USING(film_id)
JOIN inventory i
USING(film_id)
JOIN rental r
USING(inventory_id)
JOIN customer cu
USING(customer_id)
GROUP BY 1
ORDER BY 2 DESC;"
Q2 <- dbGetQuery(con, qury)
DT::datatable(Q2)
tmp1 <- Q1 %>% gather(variable, value, -genre)
tmp2 <- Q2 %>% gather(variable, value, -genre)
Q2_plot <- rbind(tmp1, tmp2) %>%
ggplot(data = ., aes(x=genre, y=value, group = variable, colour=variable, text=paste0("genre: ", genre, "\n", "value: ", value))) +
geom_line() +
geom_point() +
facet_wrap(~variable, scale="free") +
xlab('Genre') +
theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position="none")
ggplotly(Q2_plot, tooltip = "text")
1번의 결과들과 비교해보았을 때, 1번에서 Music 장르가 가장 적은 대여량(total_rent_demand)와 가장 적은 매출액(total_sales)을 기록했음에도 불구하고, 2번에서 가장 적은 고객수(total_distinct_users)를 기록하지는 않았다.
Travel 장르가 가장 적은 고객수를 기록하였다.이를 통해 알 수 있는 점은 Travel 장르의 재대여횟수가 Music 장르보다 많았다는 것이다. 1번에서 총매출액과 대여량이 가장 많았던 Sports 장르는 고객수 또한 가장 많았다.
이를 통해 Rent A Film의 고객 중 대부분이 Sports 장르에 관심이 많음을 알 수 있다.
각 장르의 평균대여료
data_model <- dm_from_data_frames(category, film_category, film)
data_model <- dm_add_references(
data_model,
category$category_id == film_category$category_id,
film_category$film_id == film$film_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <- "SELECT c.name AS Genre, ROUND(AVG(f.rental_rate),2) AS average_rental_rate
FROM category c
JOIN film_category fc
USING(category_id)
JOIN film f
USING(film_id)
GROUP BY 1
ORDER BY 2 DESC;"
Q3 <- dbGetQuery(con, qry)
DT::datatable(Q3)
Q3_plot <- Q3 %>%
ggplot(data = ., aes(x=genre, y=average_rental_rate, text=paste0("genre: ", genre, "\n", "average_rental_rate: ", average_rental_rate))) +
geom_line(group=1) +
geom_point() +
xlab('Genre') +
theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position="none")
ggplotly(Q3_plot, tooltip = "text")
tmp <- left_join(Q1, Q2, by='genre') %>%
left_join(., Q3, by='genre') %>%
select(., total_rent_demand, total_sales, total_distinct_users, average_rental_rate)
plot(tmp)
cor(tmp)
## total_rent_demand total_sales total_distinct_users
## total_rent_demand 1.0000000 0.8344022 0.9248721
## total_sales 0.8344022 1.0000000 0.8366973
## total_distinct_users 0.9248721 0.8366973 1.0000000
## average_rental_rate -0.2771723 0.2047283 -0.1727086
## average_rental_rate
## total_rent_demand -0.2771723
## total_sales 0.2047283
## total_distinct_users -0.1727086
## average_rental_rate 1.0000000
1, 2번에서 살펴보았던 총대여량(total_rent_demand), 총매출액(total_sales), 총고객수(total_distinct_users)와 평균대여료(average_rental_rate)를 비교해보았다. 총대여량과 총매출액, 총고객수끼리는 상관관계가 있지만 총대여량과 총매출액, 총고객수와 평균대여료는 상관관계가 없다.
평균대여료가 높다고 해서 그것을 찾는 사람이 적은 것은 아니라고 말할 수 있다.
반납 시기(일찍/제때/연체)에 따른 대여량
data_model <- dm_from_data_frames(film, inventory, rental)
data_model <- dm_add_references(
data_model,
film$film_id == inventory$film_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <- "WITH t1 AS (SELECT *, DATE_PART('day', return_date - rental_date) AS date_difference
FROM rental),
t2 AS (SELECT rental_duration, date_difference,
CASE
WHEN rental_duration > date_difference THEN 'Returned early'
WHEN rental_duration = date_difference THEN 'Returned on Time'
ELSE 'Returned late'
END AS Return_status
FROM film f
JOIN inventory i
USING(film_id)
JOIN t1
USING(inventory_id))
SELECT Return_status, COUNT(*) AS total_no_of_films
FROM t2
GROUP BY 1
ORDER BY 2 DESC;"
Q4 <- dbGetQuery(con, qry)
DT::datatable(Q4)
Q4_plot <- Q4 %>%
ggplot(., aes(x=return_status, y=total_no_of_films))+geom_bar(stat='identity')
ggplotly(Q4_plot)
영화의 48 %가 마감일보다 일찍 반환되고 영화의 41 %가 늦게 반환되고 11 %가 제 때 반환된다. 대부분의 고객이 DVD를 일찍 반환하는 경향을 보인다.
일찍 반납하거나 제 때 반납하는 경우는 괜찮지만 늦게 반납하는 경우는 영업에 지장을 줄 수 있다. 그러나 그 비율이 41%나 되는 것을 생각해보았을 때, 늦게 반납한 경우 벌금을 부과하는 것은 추가 수입원이 될 수 있으며 반납을 일찍 하게 하는 요인이 될 수 있다.
(Rent A Film이 입점한 국가와 국가별로 몇 명의 고객들이 있는지 / 각 국가에 대한 총매출액)
data_model <- dm_from_data_frames(country, city, address, customer, payment)
data_model <- dm_add_references(
data_model,
country$country_id == city$country_id,
city$city_id == address$city_id,
address$address_id == customer$address_id,
customer$customer_id == payment$customer_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <- "SELECT country,
COUNT(DISTINCT customer_id) AS customer_base,
SUM(amount) AS total_sales
FROM country
JOIN city
USING(country_id)
JOIN address
USING(city_id)
JOIN customer
USING(address_id)
JOIN payment
USING(customer_id)
GROUP BY 1
ORDER BY 2 DESC, 3 DESC;"
Q5 <- dbGetQuery(con, qry)
DT::datatable(Q5)
Q5_plot <- ggplot(Q5, aes(x=customer_base, y=total_sales, text=paste0("country: ", country, "\n", "customer_base: ", customer_base, "\n", "total_sales: ", total_sales))) + geom_point()
ggplotly(Q5_plot, tooltip='text')
Rent A Film은 108개의 서로 다른 국적을 가진 고객들을 보유하고 있다. 그 중 인도는 60명의 고객이 있으며 총매출액이 가장 높다. 아프가니스탄은 고객수가 가장 적은 국가가 아님에도 불구하고 이익측면에서 총매출액이 가장 적다.
( 총매출액이 가장 높은 5명의 고객을 파악하고 그들에게 보상할 때 필요한 세부 정보 파악하기 )
data_model <- dm_from_data_frames(customer, address, city, country, payment)
data_model <- dm_add_references(
data_model,
customer$address_id == address$address_id,
address$city_id == city$city_id,
city$country_id == country$country_id,
customer$customer_id == payment$customer_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <- "WITH t1 AS (SELECT *,
first_name || ' ' || last_name AS full_name
FROM customer)
SELECT full_name,
email,
address,
phone,
city,
country,
SUM(amount) AS total_purchase_in_currency
FROM t1
JOIN address
USING(address_id)
JOIN city
USING(city_id)
JOIN country
USING(country_id)
JOIN payment
USING(customer_id)
GROUP BY 1,2,3,4,5,6
ORDER BY 7 DESC
LIMIT 5;"
Q6 <- dbGetQuery(con, qry)
DT::datatable(Q6)
VIP 고객에게 실제 선물을 보상하거나 보내려고 할 때 위의 정보는 고객의 이름, 주소, 이메일 등을 보여준다. 회사의 마케팅 팀에서 보상방법을 결정할 때 이 정보를 유용하게 사용할 수 있을 것이다.
data_model <- dm_from_data_frames(payment, customer)
data_model <- dm_add_references(
data_model,
customer$customer_id == payment$customer_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <-
"SELECT COUNT(customer_id) AS num_customer
FROM customer
;"
dbGetQuery(con, qry)
## num_customer
## 1 599
qry <-
"SELECT active,
COUNT(customer_id) AS cnt
FROM customer
GROUP BY active
;"
dbGetQuery(con, qry)
## active cnt
## 1 0 15
## 2 1 584
총 고객수 599명 중 이탈 고객이 15명이고 잔존고객이 584명이다.
qry <-
"SELECT active,
COUNT(*) AS pay_cnt,
MIN(amount) AS min_amt,
AVG(amount) AS avg_amt,
MAX(amount) AS max_amt,
SUM(amount) AS total_amt
FROM payment AS p
LEFT JOIN customer AS c
ON p.customer_id = c.customer_id
GROUP BY c.active;"
dbGetQuery(con, qry)
## active pay_cnt min_amt avg_amt max_amt total_amt
## 1 0 369 0.99 4.092981 11.99 1510.31
## 2 1 14227 0.00 4.203397 11.99 59801.73
총 구매기록 14596건 중 이탈 고객의 구매기록이 369건이고 잔존고객의 구매기록이 14227건이다. 이탈 고객이나 잔존 고객이나 평균 대여료에는 큰 차이가 없다.
data_model <- dm_from_data_frames(category, film_category, film, inventory, rental)
data_model <- dm_add_references(
data_model,
category$category_id == film_category$category_id,
film_category$film_id == film$film_id,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <- "SELECT f.film_id, f.title AS movie_title, COUNT(f.title) AS num_rentals, c.name
FROM category c
JOIN film_category fc
USING(category_id)
JOIN film f
USING(film_id)
JOIN inventory i
USING(film_id)
JOIN rental r
USING(inventory_id)
GROUP BY f.film_id, f.title, c.name
ORDER BY num_rentals DESC
LIMIT 10;"
Q8 <- dbGetQuery(con, qry)
DT::datatable(Q8)
가장 대여횟수가 많은 film TOP 10은 위와같다. 거의 모든 장르가 골고루 존재하지만, 그중에서 Games 장르가 유일하게 TOP 10 내에 2개가 있다.
data_model <- dm_from_data_frames(actor, film_actor, film, inventory, rental)
data_model <- dm_add_references(
data_model,
actor$actor_id == film_actor$actor_id,
film_actor$film_id == film$film_id,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id
)
graph <- dm_create_graph(data_model, rankdir = "LR", col_attr = c("column", "type"))
dm_render_graph(graph)
qry <-
"SELECT actor_id, COUNT(film_id) AS film_cnt
FROM film_actor
GROUP BY actor_id
;"
Q9 <- dbGetQuery(con, qry)
summary(Q9$film_cnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 24.00 27.00 27.31 31.00 42.00
배우들은 보통 27편 정도의 작품에 출연하며 가장 적게는 14편, 많게는 42편의 작품에 출연한다.
qry <- "WITH t1 as (SELECT f.film_id, f.title AS movie_title, COUNT(f.title) AS num_rentals
FROM film f
JOIN inventory i
USING(film_id)
JOIN rental r
USING(inventory_id)
GROUP BY f.film_id, f.title)
SELECT ff.actor_id, ffff.first_name, ffff.last_name, SUM(fff.num_rentals) as num_rentals
FROM film_actor as ff
LEFT JOIN t1 as fff
USING(film_id)
LEFT JOIN actor as ffff
USING(actor_id)
GROUP BY ff.actor_id, ffff.first_name, ffff.last_name
ORDER BY num_rentals DESC;"
Q9 <- dbGetQuery(con, qry)
DT::datatable(Q9)
Gina Degeneres가 출연한 작품에 대한 DVD 대여수가 753번으로 가장 많았다.
data_model <- dm_from_data_frames(category, film_category, film, inventory, rental)
data_model <- dm_add_references(
data_model,
category$category_id == film_category$category_id,
film_category$film_id == film$film_id,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id
)
graph <- dm_create_graph(data_model, rankdir='LR', col_attr = c("column", "type"))
dm_render_graph(graph)
qry <-
"
SELECT film_id, special_features
FROM film;"
special <- dbGetQuery(con, qry)
DT::datatable(special)
Trailers <- special %>%
filter(str_detect(special_features, "Trailers"))
Trailers$Trailers <- 1
Commentaries <- special %>%
filter(str_detect(special_features, "Commentaries"))
Commentaries$Commentaries <- 1
Deleted <- special %>%
filter(str_detect(special_features, "Deleted"))
Deleted$Deleted <- 1
Behind <- special %>%
filter(str_detect(special_features, "Behind"))
Behind$Behind <- 1
Q10 <- left_join(special, select(Trailers, film_id, Trailers), by="film_id") %>%
left_join(., select(Commentaries, film_id, Commentaries), by="film_id") %>%
left_join(., select(Deleted, film_id, Deleted), by="film_id") %>%
left_join(., select(Behind, film_id, Behind), by="film_id") %>%
replace(., is.na(.), 0)
select(Q10, Trailers, Commentaries, Deleted, Behind) %>%
summarise_each(funs(sum))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## Trailers Commentaries Deleted Behind
## 1 535 539 503 538
전체 1000개의 film 중 각각의 tag를 가지는 film은 각각 535, 539, 503, 538개이다.
# 영화별 대여량
qry <- "SELECT f.film_id, f.title, COUNT(f.title) AS num_rentals
FROM film f
JOIN inventory i
USING(film_id)
JOIN rental r
USING(inventory_id)
GROUP BY f.film_id, f.title
ORDER BY num_rentals DESC;"
demand <- dbGetQuery(con, qry)
left_join(Q10, demand, by='film_id') %>%
group_by(Trailers, Commentaries, Deleted, Behind) %>%
summarise(., n=n())
## # A tibble: 15 x 5
## # Groups: Trailers, Commentaries, Deleted [8]
## Trailers Commentaries Deleted Behind n
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 0 0 0 1 70
## 2 0 0 1 0 61
## 3 0 0 1 1 71
## 4 0 1 0 0 62
## 5 0 1 0 1 70
## 6 0 1 1 0 65
## 7 0 1 1 1 66
## 8 1 0 0 0 72
## 9 1 0 0 1 72
## 10 1 0 1 0 66
## 11 1 0 1 1 49
## 12 1 1 0 0 72
## 13 1 1 0 1 79
## 14 1 1 1 0 64
## 15 1 1 1 1 61
left_join(Q10, demand, by='film_id') %>%
group_by(Trailers, Commentaries, Deleted, Behind) %>%
summarise(., total_rentals = sum(num_rentals, na.rm=TRUE)) %>%
arrange(., desc(total_rentals))
## # A tibble: 15 x 5
## # Groups: Trailers, Commentaries, Deleted [8]
## Trailers Commentaries Deleted Behind total_rentals
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0 1 1320
## 2 1 0 0 0 1151
## 3 1 1 0 0 1140
## 4 1 0 0 1 1137
## 5 0 0 0 1 1114
## 6 0 1 1 1 1112
## 7 0 1 0 0 1101
## 8 0 1 0 1 1093
## 9 1 0 1 0 1065
## 10 0 0 1 1 1049
## 11 0 0 1 0 1037
## 12 0 1 1 0 1020
## 13 1 1 1 1 1003
## 14 1 1 1 0 922
## 15 1 0 1 1 780
4개의 tag 중 적어도 하나의 tag를 가지므로 가능한 15가지 조합에 대해 그 개수와 대여량의 합을 구해보았다.
Trailers, Commentaris, Behind를 모두 포함하는 경우가 가장 많았으며, Trailers, Deleted, Behind를 모두 포함하는 경우가 가장 적었다. 역시나 총대여량에 있어서도 위의 조합들순으로 총대여량이 가장 높았고, 가장 적었다.
이를 통해 어떠한 tag 조합을 가질 때 더 많이 대여가 될 지 예상해볼 수 있다.
print(min(payment$payment_date))
## [1] "2007-02-14 21:21:59 KST"
print(max(payment$payment_date))
## [1] "2007-05-14 13:44:29 KST"
payment data는 2007년 2월 14일부터 2007년 5월 14일까지의 데이터이다.
pay <- select(payment, amount, payment_date)
pay$month <- strftime(pay$payment_date, '%m')
pay$day <- strftime(pay$payment_date, '%m-%d')
pay$hour <- strftime(pay$payment_date, '%H')
pay$weekday <- strftime(pay$payment_date, '%A')
head(pay)
## amount payment_date month day hour weekday
## 1 7.99 2007-02-15 22:25:46 02 02-15 22 목요일
## 2 1.99 2007-02-16 17:23:14 02 02-16 17 금요일
## 3 7.99 2007-02-16 22:41:45 02 02-16 22 금요일
## 4 2.99 2007-02-19 19:39:56 02 02-19 19 월요일
## 5 7.99 2007-02-20 17:31:48 02 02-20 17 화요일
## 6 5.99 2007-02-21 12:33:49 02 02-21 12 수요일
month_plot <- pay %>%
group_by(month) %>%
summarise(., sum_am = sum(amount)) %>%
ggplot(aes(x=month, y=sum_am, group=1,
text=paste0("month: ", month, "\n",
"sum_amount: ", sum_am))) +
geom_line() +
geom_point()
ggplotly(month_plot, tooltip = "text")
4월에 가장 많은 DVD 매출액이 있었다.
day_plot <- pay %>%
group_by(day) %>%
summarise(., sum_am = sum(amount)) %>%
ggplot(aes(x=day, y=sum_am, group=1,
text=paste0("day: ", day, "\n",
"sum_amount: ", sum_am))) +
geom_line() +
geom_point()+
theme(axis.text.x=element_text(angle=60,hjust=1))
ggplotly(day_plot, tooltip = "text")
일별로 살펴보니 4월 30일에 특출나게 많은 매출액을 보였다. 위에서 4월이 가장 매출액이 높았던 것은 이 날의 영향으로 보인다.
hour_plot <- pay %>%
group_by(hour) %>%
summarise(., sum_am = sum(amount)) %>%
ggplot(aes(x=hour, y=sum_am, group=1,
text=paste0("hour: ", hour, "\n",
"sum_amount: ", sum_am))) +
geom_line() +
geom_point()+
theme(axis.text.x=element_text(angle=60,hjust=1))
ggplotly(hour_plot, tooltip = "text")
13시나 17시에 매출액이 높았다. 여러 국가의 고객들이 존재하다보니 잘 활동하지 않는 새벽시간대에도 시차의 영향으로 인해 꽤나 많은 매출액을 차지하는 것처럼 보인다.
weekday_plot <- pay %>%
group_by(weekday) %>%
summarise(., sum_am = sum(amount)) %>%
mutate(name = fct_relevel(weekday,
"월요일", "화요일", "수요일", "목요일", "금요일", "토요일", "일요일")) %>%
ggplot(aes(x=name, y=sum_am, group=1,
text=paste0("weekday: ", weekday, "\n",
"sum_amount: ", sum_am))) +
geom_line() +
geom_point()+
theme(axis.text.x=element_text(angle=60,hjust=1))
ggplotly(weekday_plot, tooltip = "text")
data_model <- dm_from_data_frames(film, inventory, rental)
data_model <- dm_add_references(
data_model,
film$film_id == inventory$film_id,
inventory$inventory_id == rental$inventory_id
)
graph <- dm_create_graph(data_model, rankdir = "LR", col_attr = c("column", "type"))
dm_render_graph(graph)
qry <-
"SELECT r.rental_id, r.rental_date, f.rental_duration
FROM rental AS r
LEFT JOIN inventory AS i
ON r.inventory_id = i.inventory_id
LEFT JOIN film AS f
ON i.film_id = f.film_id
;"
rental_ <- dbGetQuery(con, qry)
rental_$rental_weekday <- strftime(rental_$rental_date, '%A')
DT::datatable(rental_)
Q12_plot <- rental_ %>%
group_by(rental_weekday, rental_duration) %>%
summarise(n=n()) %>%
ggplot(data = ., aes(x=match(rental_weekday, c("월요일", "화요일", "수요일", "목요일", "금요일", "토요일", "일요일")), y=n, group = rental_duration, colour=rental_duration, text=paste0("rental_weekday: ", rental_weekday, "\n", "value: ", n))) +
geom_line(group=1) +
geom_point() +
facet_wrap(~rental_duration, nrow=1) +
xlab('요일') +
scale_x_discrete(labels= c("월요일", "화요일", "수요일", "목요일", "금요일", "토요일", "일요일")) +
theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position="none")
ggplotly(Q12_plot, tooltip = "text")
대여가능일수가 3일, 4일, 7일이면 화요일에 빌리는 사람이 많고, 5일이면 토요일, 6일이면 화요일이나 일요일에 많이 빌리는 것을 알 수 있었다.
대체로 월요일과 수요일에 적게 빌리는 것을 확인할 수 있다.
# Database와의 연결 끊기
DBI::dbDisconnect(con)
## [1] TRUE